install.packages(c("rio", "tidyverse", "janitor", "lubridate", "rmarkdown", "fs", "hms"))

Nomilo Fishpond Biogeochemical Analysis
Throughout this document, hover over the numbered annotations to the right of code chunks to reveal detailed explanations and comments about the code. Where drop-down italicized text is present, expand by pressing on arrow to see code.
Install Packages
Load Libraries
library(rio)
library(tidyverse)
library(janitor)
library(lubridate)
library(rmarkdown)
library(fs)
library(hms)- 1
- For importing excel data
- 2
- For cleaning of data
- 3
- For cleaning variable names
- 4
- For cleaning dates
- 5
- For displaying tables
- 6
- For file path usage
- 7
- For handling time-of-day values
Import Raw Data
Procedure
Define vector of files to import:
# Collect every raw data file path under data/raw.
files_to_import <- dir_ls(path = "data/raw")

# Print each path alongside its index so later chunks can reference
# files by position in files_to_import.
for (file_idx in seq_along(files_to_import)) {
  cat(file_idx, "= ", files_to_import[file_idx], "\n")
}
-
Store the file paths of our raw data within the
data/rawdirectory infiles_to_import - 2
- Print each file path with its index
1 = data/raw/2024-02-28_dfs.RData
2 = data/raw/2024-02-28_ksf-clam-growth.xlsx
3 = data/raw/2024-02-28_ksf-compiled-data.xlsx
4 = data/raw/2024-02-28_ksf-oyster-cylinder-growth.xlsx
5 = data/raw/2024-02-28_profile-data.xlsx
6 = data/raw/2024-02-28_water-samples.xlsx
7 = data/raw/2024-02-28_weather-data.xlsx
8 = data/raw/2024-03-01_dfs-no-profiles.RData
9 = data/raw/2024-03-01_dfs_no_profiles.RData
10 = data/raw/2024-03-04_dfs-no-profiles.RData
11 = data/raw/~$2024-02-28_weather-data.xlsx
Use the purrr::map() function to iteratively import files in the files_to_import vector except for the profiles data and .RData files:
The @iteratively-import-raw-data code chunk should only be run once, when the raw data is updated, because it takes a long time to execute. Otherwise, run the @efficiently-load-raw-data code chunk instead to easily import up-to-date raw data.
# Import every sheet of the selected Excel workbooks (skips the profiles
# spreadsheet and the .RData files).
# NOTE: check the printed files_to_import listing to confirm these index
# values still point at the intended files before re-running.
workbook_indices <- c(2:4, 6, 7)
dfs_no_profiles <- map(files_to_import[workbook_indices], import_list)

# Cache the imported list so later sessions can load() it instead of
# repeating the slow import.
current_date <- format(Sys.Date(), "%Y-%m-%d")
save(
  dfs_no_profiles,
  file = paste0("data/raw/", current_date, "_dfs-no-profiles.RData")
)
Efficiently import up-to-date raw data:
load(files_to_import[10])Rename datasets:
We will always use snakecase when naming our data objects and functions (e.g., data_object_name or function_name()).
# Derive object names from file paths: strip the directory, the leading
# YYYY-MM-DD_ date stamp, and the Excel extension.
# Generalized from the hard-coded "2024-02-28_" prefix so this chunk keeps
# working whenever the raw data is refreshed under a new date.
names(dfs_no_profiles) <- gsub("data/raw/\\d{4}-\\d{2}-\\d{2}_|\\.xlsx$|\\.xls$", "",
files_to_import[c(2:4, 6, 7)])
# Hyphens -> underscores to match our snake_case naming convention.
names(dfs_no_profiles) <- gsub("-", "_", names(dfs_no_profiles))
# Echo the resulting names as a sanity check.
names(dfs_no_profiles)
- Remove prefixes and file extensions
- 2
- Replace hyphens with underscores
- 3
- Check if names were outputted correctly
[1] "ksf_clam_growth" "ksf_compiled_data"
[3] "ksf_oyster_cylinder_growth" "water_samples"
[5] "weather_data"
Rename each sheet within each raw dataset to be lowercased and replace spaces with underscores:
# Normalize every sheet name within each dataset: lowercase with
# spaces replaced by underscores.
dfs_no_profiles <- map(
  dfs_no_profiles,
  \(dataset) set_names(dataset, gsub(" ", "_", tolower(names(dataset))))
)

# Create separate datasets by pulling the sheet of interest out of each workbook.
ksf_clam_growth_data <- dfs_no_profiles$ksf_clam_growth$sheet1
ksf_compiled_data <- dfs_no_profiles$ksf_compiled_data$full_data
ksf_oyster_cylinder_growth_data <- dfs_no_profiles$ksf_oyster_cylinder_growth$sheet1
water_samples_data <- dfs_no_profiles$water_samples$data_overview
weather_data <- dfs_no_profiles$weather_data$weather_ksf
tidal_data <- dfs_no_profiles$ksf_compiled_data$tides
# The profiles spreadsheet needs multiple sheets combined into one, so it is
# imported separately in the next chunk.
# Sheets of the profiles spreadsheet to combine (one per sampling location).
sheets_to_import <- c("L1", "L2", "L3", "L4")
# Import each sheet and row-bind the results into one dataframe.
# Fixes: removed the accidental double assignment
# (`profiles_data <- profiles_data <- ...`) and the redundant trailing
# `bind_rows()` — map_dfr() already row-binds its results.
profiles_data <- map_dfr(sheets_to_import, \(sheet_name) {
  import(files_to_import[5], which = sheet_name)
})
- [code annotation]
- 2
- [code annotation]
- 3
- [code annotation]
View Raw Data
Tidy Raw Data
Tidying Processes
Steps to clean data
# Manually-assigned column names for the raw clam growth sheet
# (the spreadsheet's own headers are not machine-usable).
new_clam_var_names <- c(
"sort_date", "color", "clams_in_count", "clams_in_lbs", "clams_in_avg_per_lb",
"clams_out_count", "clams_out_lbs", "clams_out_avg_per_lb", "growth_in_lbs",
"growth_pct", "sr", "days_btwn_sort"
)
# Hard-coded sort dates, one per data row after the header row is dropped.
# NOTE(review): assumes the sheet has exactly 9 data rows — update this
# vector whenever the raw data is refreshed; confirm row count matches.
new_clam_date_col <- c(
"2023-10-17", "2023-12-06", "2023-12-12", "2024-01-02", "2024-01-10", "2024-01-24",
"2024-01-31", "2024-02-08", "2024-02-13"
)
ksf_clam_growth_data_tidied <- ksf_clam_growth_data %>%
# Drop the first row (embedded header row from the spreadsheet)
slice(-1) %>%
setNames(new_clam_var_names) %>%
# Replace the unusable sort_date column with proper Date values
mutate(date = as.Date(new_clam_date_col)) %>%
dplyr::select(-sort_date) %>%
# Wide -> long: split clams_in_* / clams_out_* into a "stage" key plus
# value columns. With names_sep = "_", pieces beyond the first two
# (e.g. "avg_per_lb") are truncated, so the value column becomes "avg"
# — renamed back to avg_per_lbs below.
pivot_longer(
cols = c(
clams_in_count, clams_in_lbs, clams_in_avg_per_lb, clams_out_count,
clams_out_lbs, clams_out_avg_per_lb
),
names_to = c("stage", ".value"),
names_prefix = "clams_",
names_sep = "_",
values_to = "value"
) %>%
# Recode stage to readable labels ("in" -> "In", anything else -> "Out")
mutate(stage = if_else(str_detect(stage, "in"), "In", "Out")) %>%
rename(avg_per_lbs = avg) %>%
# stage and color are categorical
mutate(across(c(color, stage), as.factor)) %>%
# Strip "%" signs and coerce the measurement columns to numeric
mutate(across(c(count, lbs, avg_per_lbs, growth_in_lbs, growth_pct, sr),
~as.numeric(gsub("%", "", .)))) %>%
arrange(date, color, stage) %>%
# Reorder columns for readability
dplyr::select(date, days_btwn_sort, color, stage, count, lbs, avg_per_lbs,
growth_in_lbs, growth_pct, sr) %>%
# Prefix names with clams_ so they stay unambiguous after the final merge
rename("days_btwn_clams_sort" = days_btwn_sort,
"clams_color" = color,
"clams_stage" = stage,
"clams_count" = count,
"weight" = lbs,
"avg_weight" = avg_per_lbs,
"clams_growth" = growth_in_lbs,
"clams_sr" = sr)
paged_table(ksf_clam_growth_data_tidied)- 1
- Manually set variable names
- 2
- Assign dates to new date column
- 3
- Delete first row
- 4
- Set date as correct variable type and pivot data set based on date range.
- 5
- Assign In and Out to stage
- 6
- Rename variable of average to average per lbs
- 7
- Set stage and color as factor variable types
- 8
- Set variables as numeric variable types
- 9
- Arrange values by date, color, and stage
Steps to clean data
ksf_compiled_data_tidied <- ksf_compiled_data %>%
# Remove parenthesized units from header names, then snake_case them
rename_with(~gsub("\\s*\\([^\\)]+\\)", "", .x)) %>%
janitor::clean_names() %>%
rename(date = date_time) %>%
# Keep only the date part of the timestamp
mutate(date = as.Date(date)) %>%
# Restrict to the study window
filter(date >= as.Date("2023-11-20") & date <= as.Date("2024-02-20")) %>%
arrange(date) %>%
# Drop variables not needed for the analysis
dplyr::select(-c(external_voltage, wk_num, wind_dir,
spadd, outdoor_temperature, hourly_rain,
solar_radiation, resistivity, battery_capacity,
hour, daynum, data_pt, wind_sp, diradd,
wind_speed, wind_direction, tide, day, month, year)
) %>%
# Drop every column that contains ANY NA value (not just all-NA columns)
dplyr::select(where(~ !anyNA(.))) %>%
# Collapse to one row per day: mean of every numeric variable
group_by(date) %>%
summarise(across(where(is.numeric), \(x) mean(x, na.rm = TRUE))) %>%
# Prefix names with ksf_ so they stay unambiguous after the final merge
rename("ksf_salinity" = salinity,
"ksf_rdo_saturation" = rdo_saturation,
"ksf_rdo_concentration" = rdo_concentration,
"ksf_actual_conductivity" = actual_conductivity,
"ksf_total_dissolved_solids" = total_dissolved_solids,
"ksf_ammonium" = ammonium,
"ksf_barometric_pressure" = barometric_pressure,
"ksf_oxygen_partial_pressure" = oxygen_partial_pressure,
"ksf_specific_conductivity" = specific_conductivity,
"ksf_density" = density,
"ksf_chlorophyll_a_fluorescence" = chlorophyll_a_fluorescence,
"ksf_ammonium_m_v" = ammonium_m_v)
paged_table(ksf_compiled_data_tidied)- 1
- Clean variable names by removing everything in parentheses, using lowercase and underscores in place of spaces
- 2
-
Rename the
date_timevariable todate, filter to desired date range and sort bydate - 3
- Remove unnecessary variables
- 4
- Remove columns containing any NA values
- 5
-
Group by
dateand calculate the average of every variable for each day
Steps to clean data
# Manually-assigned names for the six columns kept from the raw oyster sheet.
oyster_var_names <- c(
"date", "oyster_large_weight", "oyster_large_gain", "oyster_small_weight",
"oyster_small_gain", "oyster_chlorophyll"
)
ksf_oyster_cylinder_growth_data_tidied <- ksf_oyster_cylinder_growth_data %>%
# Keep only the desired columns (positional — tied to the raw sheet layout)
dplyr::select(c(1, 4, 5, 8, 9, 12)) %>%
# Drop the first row (embedded header row from the spreadsheet)
slice(-1) %>%
setNames(oyster_var_names) %>%
# Wide -> long: split oyster_large_* / oyster_small_* into an
# "oyster_size" key plus weight/gain value columns
pivot_longer(
cols = c(oyster_large_weight, oyster_large_gain,
oyster_small_gain,
oyster_small_weight),
names_to = c("oyster_size", ".value"),
names_prefix = "oyster_",
names_sep = "_",
values_to = "value"
) %>%
# Recode size to readable labels ("small" -> "Small", anything else -> "Large")
mutate(oyster_size = if_else(str_detect(oyster_size, "small"), "Small", "Large")) %>%
# Set correct variable types
mutate(date = as.Date(date),
oyster_size = as.factor(oyster_size),
across(c(weight, gain), as.numeric)
) %>%
# Restrict to the study window
filter(date >= as.Date("2023-11-20") & date <=
as.Date("2024-02-14")) %>%
# Replace missing numeric values with 0 (addresses NAs before the merge)
mutate_if(is.numeric, ~replace_na(., 0)) %>%
# Unit conversion; 0.00220462 is the grams->pounds factor.
# NOTE(review): assumes raw weight is recorded in grams — confirm units.
mutate(weight = weight * 0.00220462) %>%
rename("growth_pct" = gain)
paged_table(ksf_oyster_cylinder_growth_data_tidied)- 1
- Manually set variable names
- 2
- Select desired columns and remove first row
- 3
- Convert from wide to long format
- 4
- Create a new variable that differentiates oyster size
- 5
- Adjust data types to numeric and factor
- 6
- Filter to desired date range
- 7
- Fill NA values as 0
Address the NA values before merge
Steps to clean data
water_samples_data_tidied <- water_samples_data %>%
# Drop trailing rows 44-52 (presumably notes/summary rows in the raw
# sheet — TODO confirm against the spreadsheet)
slice(-c(44:52)) %>%
# Remove parenthesized units from header names, then snake_case them
rename_with(~gsub("\\s*\\([^\\)]+\\)", "", .x)) %>%
janitor::clean_names() %>%
# Repair one date entry: the raw cell came through as the Excel serial
# string "44074"; it is mapped to 2024-01-09 here. All other dates are
# parsed from day-month-year text.
# NOTE(review): Excel serial 44074 would normally decode to 2020-08-31 —
# confirm 2024-01-09 is the intended correction.
mutate(
date = if_else(date == "44074",
as.character(as.Date("2024-01-09")),
format(dmy(date), "%Y-%m-%d"))
) %>%
# Sequential sample ID, assigned in row order
mutate(sample_id = 1:nrow(.)) %>%
# Set correct variable types; missing tube names become the explicit
# level "No Name" before factoring
mutate(date = as.Date(date),
across(c(nomilo_id, location, round, depth), as.factor),
tube_name = as.character(tube_name),
tube_name = ifelse(is.na(tube_name), "No Name", tube_name),
tube_name = as.factor(tube_name)) %>%
# Replace missing numeric values with 0
mutate_if(is.numeric, ~replace_na(., 0)) %>%
# Drop the first two columns (positional — tied to the raw sheet layout)
select(-c(1,2))
paged_table(water_samples_data_tidied)- 1
- Clean variable names by removing everything in parentheses, using lowercase and underscores in place of spaces
- 2
- Replaces incorrect date values and formats them as YYYY-MM-DD
- 3
- Add values for sample ID
- 4
- Set correct variable types
- 5
- Replaces NA values with 0
Steps to clean data
weather_data_tidied <- weather_data %>%
# snake_case the header names
janitor::clean_names() %>%
# Combine separate year/month/day columns into one YYYY-MM-DD date column
unite(date, year, month, day, sep = "-") %>%
mutate(date = ymd(date)) %>%
# Drop unneeded columns (positional — tied to the raw sheet layout)
select(-c(1, 3)) %>%
rename("outdoor_temperature" = outdoor_temp_f) %>%
# Convert Fahrenheit to Celsius
mutate(outdoor_temperature = (outdoor_temperature - 32) * (5/9)) %>%
# Collapse to one row per day: mean of every numeric variable
group_by(date) %>%
summarise(across(where(is.numeric), \(x) mean(x, na.rm = TRUE))) %>%
# Drop the first day's (partial) record
slice(-1)
paged_table(weather_data_tidied)- 1
- Clean variable names
- 2
- Merge separate day, month, year columns into one column variable and format as YYYY-MM-DD.
- 3
- Cut columns
- 4
- Rename outdoor temperature and convert from Fahrenheit to Celsius
- 5
- Group by date and then take average values per day
- 6
- Cut first row
Steps to clean data
# Target variable names; the last two ("location", "date") correspond to the
# placeholder columns created below.
new_profile_var_names <- c("depth", "water_temperature", "dissolved_oxygen", "salinity", "conductivity", "visibility", "location", "date")
profiles_data_tidied <- profiles_data %>%
# Drop unneeded columns 6 and 8 (positional — tied to the raw sheet layout)
select(-c(6, 8)) %>%
# Add two placeholder columns so the frame has 8 columns to rename
mutate(
temp_column1 = NA_character_,
temp_column2 = NA_character_
) %>%
setNames(new_profile_var_names) %>%
# The raw sheets interleave "Location"/"Date" label rows with the data;
# when depth holds such a label, the matching value sits in the second
# column (here named water_temperature). Pull those values out...
mutate(
location = ifelse(depth == "Location", water_temperature, NA_character_),
date = ifelse(depth == "Date", water_temperature, NA_character_)
) %>%
# ...then carry them down onto the data rows that follow each label row
fill(location, date, .direction = "down") %>%
# Remove the label rows themselves
filter(depth != "Location", depth != "Date") %>%
# Standardize location labels and repair date entries (some came through
# as Excel serial strings, some as D/M/YY text).
# NOTE(review): "auwei" differs from the sheet spelling "Auwai" — confirm
# which spelling downstream code expects.
mutate(
location = case_when(
location == "L1 Northwest buoy" ~ "back buoy",
location == "L2 Middle Buoy" ~ "mid buoy",
location == "L3 Production Dock" ~ "production dock",
location == "L4 Auwai" ~ "auwei",
TRUE ~ location
),
date = case_when(
date %in% c("45258", "2023-11-28") ~ "2023-11-28",
date %in% c("45282", "2023-12-21") ~ "2023-12-22",
date %in% c("45536", "2024-01-09") ~ "2024-01-09",
date %in% c("30/1/24", "30/01/24") ~ "2024-01-30",
date %in% c("20/02/24", "20/2/24") ~ "2024-02-20",
TRUE ~ date
)) %>%
mutate(
date = as.Date(date, format = "%Y-%m-%d"),
# Blank out conductivity for the first 11 rows (positional — depends on
# row order at this point in the pipeline; NAs become 0 at the end)
conductivity = case_when(
row_number() %in% c(1:11) ~ NA_character_,
TRUE ~ as.character(conductivity)
)
) %>%
# Remove remaining embedded header rows ("Samples"/"Depth" labels)
filter(!(depth %in% c("Samples", "Depth"))) %>%
# Set correct variable types
mutate(date = as.Date(date),
across(c(depth, location), as.factor),
across(c(water_temperature, dissolved_oxygen, salinity,
conductivity,visibility), as.numeric)) %>%
# Visibility is recorded once per profile; carry it down the depth rows
fill(visibility, .direction = "down") %>%
# No visibility reading on 2023-11-28 — force it to 0 for that date
mutate(visibility = if_else(date == "2023-11-28", 0, visibility)) %>%
# Replace remaining missing numeric values with 0
mutate_if(is.numeric, ~replace_na(., 0))
paged_table(profiles_data_tidied)- 1
- Set new variable names manually
- 2
- Delete unnecessary columns
- 3
- Temporarily create two new columns to replace the ones we deleted
- 4
- Assign new profile variable names to rename variables in data set
- 5
- Takes location from one column of location and date data, and assigns it to corresponding data of another column.
- 6
- Fill values of temperature downwards in newly created date and location column.
- 7
- Gets rid of the ‘location’ and ‘date’ rows that had empty values.
- 8
- Renames values
- 9
- Blanks out (sets to NA) the conductivity values in rows 1:11
- 10
- Looks for rows containing samples and depth names and negate these values.
- 11
- Sets correct data types for each variable
- 12
- Fills values from the temperature downwards into the newly created columns for date and location
- 13
- Sets missing data and NA values as 0
Steps to clean data
# Study window shared with the other tidied datasets.
tidal_window_start <- as.Date("2023-11-20")
tidal_window_end <- as.Date("2024-02-20")

# Tidy the tidal sheet: snake_case names, Date-typed dates restricted to the
# study window, an unneeded second column dropped, time as an hms value, and
# the high/low marker as a factor.
tidal_data_tidied <- tidal_data %>%
  janitor::clean_names() %>%
  mutate(date = as.Date(date, format = "%Y-%m-%d")) %>%
  filter(between(date, tidal_window_start, tidal_window_end)) %>%
  select(-2) %>%
  mutate(
    time = as_hms(format(time, "%H:%M:%S")),
    high_low = as.factor(high_low)
  )
paged_table(tidal_data_tidied)- 1
- Clean variable names
- 2
- Set date as correct variable type and format YYYY-MM-DD
- 3
- Filter to desired date range
- 4
- Cut column
- 5
- Set time as time variable type
- 6
- Set variable as factor type
Merge Tidied Datasets
# Step 1: merge the growth datasets; clam and oyster tables share the
# date/weight/growth_pct columns.
first_merge <- full_join(
  ksf_clam_growth_data_tidied,
  ksf_oyster_cylinder_growth_data_tidied,
  by = c("date", "weight", "growth_pct")
)

# Step 2: fold in the profile and water-sample data, joining on all
# columns the tables share (natural join).
second_merge <- reduce(
  list(first_merge, profiles_data_tidied, water_samples_data_tidied),
  full_join
)

# Step 3: attach the daily sensor, tidal, and weather summaries by date.
final_merge <- reduce(
  list(second_merge, ksf_compiled_data_tidied, tidal_data_tidied, weather_data_tidied),
  full_join,
  by = "date"
)
Export tidied datasets to CSV into data/tidied folder:
# Load the helper that writes a dataframe to <dir_path>/<df_name>.csv.
source("code/functions/export_to_csv.R")

# Named list of tidied dataframes to export; the list names become the
# CSV file names.
# NOTE(review): tidal_data_tidied and weather_data_tidied are not in this
# list and therefore are not exported — confirm that is intentional.
dfs_to_export <- list(
  ksf_clam_growth_data_tidied = ksf_clam_growth_data_tidied,
  ksf_compiled_data_tidied = ksf_compiled_data_tidied,
  ksf_oyster_cylinder_growth_data_tidied = ksf_oyster_cylinder_growth_data_tidied,
  water_samples_data_tidied = water_samples_data_tidied,
  profiles_data_tidied = profiles_data_tidied
)

# Write each dataframe to data/tidied, passing the dataframe and its list
# name to export_to_csv().
imap(dfs_to_export, \(df, df_name) export_to_csv(df, df_name, "data/tidied"))
- List of dataframes we want to export as CSV files
- 2
-
Iterate the
export_to_csv(df, df_name, dir_path)function over each dataframe..xrefers to the dataframe..yrefers to the name of the dataframe. These are passed toexport_to_csv()function along with the desired directory path.
Export merged final data set into data/outputs folder.